import pandas as pd
import numpy as np
import os
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
df = pd.read_csv('supply_chain_data.csv')
df['Product type'] = df['Product type'].replace({
'skincare': 'credit card',
'haircare': 'charge card',
'cosmetics': 'co-branded card'
})
df.head()
| Product type | SKU | Price | Availability | Number of products sold | Revenue generated | Customer demographics | Stock levels | Lead times | Order quantities | ... | Location | Lead time | Production volumes | Manufacturing lead time | Manufacturing costs | Inspection results | Defect rates | Transportation modes | Routes | Costs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | charge card | SKU0 | 69.808006 | 55 | 802 | 8661.996792 | Non-binary | 58 | 7 | 96 | ... | Mumbai | 29 | 215 | 29 | 46.279879 | Pending | 0.226410 | Road | Route B | 187.752075 |
| 1 | credit card | SKU1 | 14.843523 | 95 | 736 | 7460.900065 | Female | 53 | 30 | 37 | ... | Mumbai | 23 | 517 | 30 | 33.616769 | Pending | 4.854068 | Road | Route B | 503.065579 |
| 2 | charge card | SKU2 | 11.319683 | 34 | 8 | 9577.749626 | Unknown | 1 | 10 | 88 | ... | Mumbai | 12 | 971 | 27 | 30.688019 | Pending | 4.580593 | Air | Route C | 141.920282 |
| 3 | credit card | SKU3 | 61.163343 | 68 | 83 | 7766.836426 | Non-binary | 23 | 13 | 59 | ... | Kolkata | 24 | 937 | 18 | 35.624741 | Fail | 4.746649 | Rail | Route A | 254.776159 |
| 4 | credit card | SKU4 | 4.805496 | 26 | 871 | 2686.505152 | Non-binary | 5 | 3 | 56 | ... | Delhi | 5 | 414 | 3 | 92.065161 | Fail | 3.145580 | Air | Route A | 923.440632 |
5 rows × 24 columns
df.describe()
| Price | Availability | Number of products sold | Revenue generated | Stock levels | Lead times | Order quantities | Shipping times | Shipping costs | Lead time | Production volumes | Manufacturing lead time | Manufacturing costs | Defect rates | Costs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.00000 | 100.000000 | 100.000000 | 100.000000 |
| mean | 49.462461 | 48.400000 | 460.990000 | 5776.048187 | 47.770000 | 15.960000 | 49.220000 | 5.750000 | 5.548149 | 17.080000 | 567.840000 | 14.77000 | 47.266693 | 2.277158 | 529.245782 |
| std | 31.168193 | 30.743317 | 303.780074 | 2732.841744 | 31.369372 | 8.785801 | 26.784429 | 2.724283 | 2.651376 | 8.846251 | 263.046861 | 8.91243 | 28.982841 | 1.461366 | 258.301696 |
| min | 1.699976 | 1.000000 | 8.000000 | 1061.618523 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.013487 | 1.000000 | 104.000000 | 1.00000 | 1.085069 | 0.018608 | 103.916248 |
| 25% | 19.597823 | 22.750000 | 184.250000 | 2812.847151 | 16.750000 | 8.000000 | 26.000000 | 3.750000 | 3.540248 | 10.000000 | 352.000000 | 7.00000 | 22.983299 | 1.009650 | 318.778455 |
| 50% | 51.239831 | 43.500000 | 392.500000 | 6006.352023 | 47.500000 | 17.000000 | 52.000000 | 6.000000 | 5.320534 | 18.000000 | 568.500000 | 14.00000 | 45.905622 | 2.141863 | 520.430444 |
| 75% | 77.198228 | 75.000000 | 704.250000 | 8253.976921 | 73.000000 | 24.000000 | 71.250000 | 8.000000 | 7.601695 | 25.000000 | 797.000000 | 23.00000 | 68.621026 | 3.563995 | 763.078231 |
| max | 99.171329 | 100.000000 | 996.000000 | 9866.465458 | 100.000000 | 30.000000 | 96.000000 | 10.000000 | 9.929816 | 30.000000 | 985.000000 | 30.00000 | 99.466109 | 4.939255 | 997.413450 |
import plotly.graph_objs as go
from plotly.subplots import make_subplots
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
go.Scatter(x=df['SKU'], y=df['Availability'], mode='lines', name='Availability', line=dict(color='blue')),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=df['SKU'], y=df['Number of products sold'], mode='lines', name='Number of products sold', line=dict(color='green')),
secondary_y=True,
)
fig.update_layout(
title=dict(text='Availability and Number of Products Sold by SKU', x=0.5),
xaxis_title='SKU',
yaxis_title='Availability',
yaxis2_title='Number of Products Sold',
font=dict(family="Arial", size=14),
# legend_title_text='Metric',
legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5),
width=1070,
height=600
)
fig.show()
fig_stock_order = make_subplots(specs=[[{"secondary_y": True}]])
fig_stock_order.add_trace(
go.Scatter(x=df['SKU'], y=df['Stock levels'], mode='lines', name='Stock levels', line=dict(color='blue')),
secondary_y=False,
)
fig_stock_order.add_trace(
go.Scatter(x=df['SKU'], y=df['Order quantities'], mode='lines', name='Order quantities', line=dict(color='green')),
secondary_y=True,
)
fig_stock_order.update_layout(
title=dict(text='Stock Levels and Order Quantities by SKU', x=0.5),
xaxis_title='SKU',
yaxis_title='Stock Levels',
yaxis2_title='Order Quantities',
font=dict(family="Arial", size=14),
legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5),
width=1070,
height=600
)
fig_stock_order.show()
defect_rates_by_product = df.groupby("Product type")['Defect rates'].mean().reset_index()
fig = px.bar(defect_rates_by_product, x='Product type', y='Defect rates', title='Defect Rates by Product Type')
color_scale = px.colors.sequential.Turbo
fig.update_traces(marker_color=color_scale)
fig.update_layout(
xaxis_title="Product Type",
yaxis_title="Mean Defect Rates",
xaxis=dict(categoryorder='total descending'),
yaxis=dict(title='Mean Defect Rates'),
plot_bgcolor='white',
title_x=0.5,
showlegend=True
)
fig.show()
pivot_table = pd.pivot_table(df, values='Defect rates',
index=['Transportation modes'],
aggfunc='mean')
transportation_chart = px.pie(values=pivot_table["Defect rates"],
names=pivot_table.index,
title='Defect Rates by Transportation Mode',
hole=0.5,
color_discrete_sequence=px.colors.sequential.Turbo)
transportation_chart.update_layout(title={'text': 'Defect Rates by Transportation Mode', 'x': 0.5})
transportation_chart.show()
shipping_cost_revenue = df.groupby(['Shipping carriers'])[['Shipping costs', 'Revenue generated']].sum().reset_index()
shipping_cost_revenue_sorted_cost = shipping_cost_revenue.sort_values(by='Shipping costs', ascending=False)
shipping_cost_revenue_sorted_revenue = shipping_cost_revenue.sort_values(by='Revenue generated', ascending=False)
fig = make_subplots(rows=1, cols=2, subplot_titles=("Total Shipping Cost by Shipping Carrier", "Total Revenue by Shipping Carrier"))
fig.add_trace(
go.Bar(x=shipping_cost_revenue_sorted_cost['Shipping carriers'], y=shipping_cost_revenue_sorted_cost['Shipping costs'],
name='Shipping Cost', marker_color=px.colors.sequential.Blues),
row=1, col=1
)
fig.add_trace(
go.Bar(x=shipping_cost_revenue_sorted_revenue['Shipping carriers'], y=shipping_cost_revenue_sorted_revenue['Revenue generated'],
name='Revenue Generated', marker_color=px.colors.sequential.Greens),
row=1, col=2
)
fig.update_layout(
title=dict(text='Shipping Cost and Revenue by Shipping Carrier', x=0.5),
font=dict(family="Arial", size=14),
showlegend=False
)
fig.update_xaxes(title_text="Shipping Carrier", tickangle=-45, row=1, col=1)
fig.update_xaxes(title_text="Shipping Carrier", tickangle=-45, row=1, col=2)
fig.update_yaxes(title_text="Total Shipping Cost", row=1, col=1)
fig.update_yaxes(title_text="Total Revenue", row=1, col=2)
fig.show()
avg_lead_times_transport = df.groupby(['Transportation modes'])['Lead times'].mean().reset_index()
avg_lead_times_transport = avg_lead_times_transport.sort_values(by='Lead times', ascending=False)
avg_lead_times_transport['Lead times'] = avg_lead_times_transport['Lead times'].round(2)
avg_shipping_times_transport = df.groupby(['Transportation modes'])['Shipping times'].mean().reset_index()
avg_shipping_times_transport = avg_shipping_times_transport.sort_values(by='Shipping times', ascending=False)
avg_shipping_times_transport['Shipping times'] = avg_shipping_times_transport['Shipping times'].round(2)
fig = make_subplots(rows=1, cols=2, subplot_titles=("Average Lead Times by Transportation Mode", "Average Shipping Times by Transportation Mode"))
fig.add_trace(
go.Bar(x=avg_lead_times_transport['Lead times'], y=avg_lead_times_transport['Transportation modes'],
orientation='h', name='Lead Times', marker=dict(color=avg_lead_times_transport['Lead times'], colorscale='Blues'),
text=avg_lead_times_transport['Lead times'], textposition='auto'),
row=1, col=1
)
fig.add_trace(
go.Bar(x=avg_shipping_times_transport['Shipping times'], y=avg_shipping_times_transport['Transportation modes'],
orientation='h', name='Shipping Times', marker=dict(color=avg_shipping_times_transport['Shipping times'], colorscale='Blues'),
text=avg_shipping_times_transport['Shipping times'], textposition='auto'),
row=1, col=2
)
fig.update_layout(
width=1070,
height=600,
title_text="Average Lead and Shipping Times by Transportation Mode", # Overall title
title_x=0.5,
showlegend=False,
font=dict(family="Arial", size=14),
)
fig.update_xaxes(title_text="Average Lead Time", row=1, col=1)
fig.update_yaxes(title_text="Transportation Mode", row=1, col=1)
fig.update_xaxes(title_text="Average Shipping Time", row=1, col=2)
fig.update_yaxes(title_text="Transportation Mode", row=1, col=2)
fig.show()
avg_lead_times_carrier = df.groupby(['Shipping carriers'])['Lead times'].mean().reset_index()
avg_lead_times_carrier = avg_lead_times_carrier.sort_values(by='Lead times', ascending=False)
avg_lead_times_carrier['Lead times'] = avg_lead_times_carrier['Lead times'].round(2)
avg_shipping_times_carrier = df.groupby(['Shipping carriers'])['Shipping times'].mean().reset_index()
avg_shipping_times_carrier = avg_shipping_times_carrier.sort_values(by='Shipping times', ascending=False)
avg_shipping_times_carrier['Shipping times'] = avg_shipping_times_carrier['Shipping times'].round(2)
fig = make_subplots(rows=1, cols=2, subplot_titles=("Average Lead Times by Shipping Carrier", "Average Shipping Times by Shipping Carrier"))
fig.add_trace(
go.Bar(x=avg_lead_times_carrier['Lead times'], y=avg_lead_times_carrier['Shipping carriers'],
orientation='h', name='Lead Times', marker=dict(color=avg_lead_times_carrier['Lead times'], colorscale='Blues'),
text=avg_lead_times_carrier['Lead times'], textposition='auto'),
row=1, col=1
)
fig.add_trace(
go.Bar(x=avg_shipping_times_carrier['Shipping times'], y=avg_shipping_times_carrier['Shipping carriers'],
orientation='h', name='Shipping Times', marker=dict(color=avg_shipping_times_carrier['Shipping times'], colorscale='Blues'),
text=avg_shipping_times_carrier['Shipping times'], textposition='auto'),
row=1, col=2
)
fig.update_layout(
width=1070,
height=600,
title_text="Average Lead and Shipping Times by Shipping Carrier", # Overall title
title_x=0.5,
showlegend=False,
font=dict(family="Arial", size=14),
)
fig.update_xaxes(title_text="Average Lead Time", row=1, col=1)
fig.update_yaxes(title_text="Shipping Carrier", row=1, col=1)
fig.update_xaxes(title_text="Average Shipping Time", row=1, col=2)
fig.update_yaxes(title_text="Shipping Carrier", row=1, col=2)
fig.show()
df['Order_Stock_Ratio'] = df['Stock levels'] / df['Order quantities']
df_sub = df[df['Order_Stock_Ratio'] <= 15]
fig = px.scatter(df_sub, x='Order_Stock_Ratio', y='Lead times',
labels={'Order_Stock_Ratio': 'Stock to Order Ratio', 'Lead times': 'Lead Time'},
title='How Stock to Order Ratio is related to Lead Time',
trendline='lowess')
fig.update_layout(
xaxis_title="Stock to Order Ratio",
yaxis_title="Lead Time",
font=dict(family="Arial", size=14),
title_x=0.5
)
fig.show()
revenue_avg_by_demo_prod = df.groupby(['Customer demographics', 'Product type'])['Revenue generated'].mean().reset_index()
revenue_sum_by_demo_prod = df.groupby(['Customer demographics', 'Product type'])['Revenue generated'].sum().reset_index()
colors = px.colors.sequential.deep
fig = make_subplots(rows=1, cols=2, subplot_titles=('Average Revenue', 'Total Revenue'))
for i, product_type in enumerate(revenue_avg_by_demo_prod['Product type'].unique()):
subset = revenue_avg_by_demo_prod[revenue_avg_by_demo_prod['Product type'] == product_type]
fig.add_trace(
go.Bar(
x=subset['Customer demographics'],
y=subset['Revenue generated'],
name=product_type,
marker_color=colors[i]
),
row=1, col=1
)
for i, product_type in enumerate(revenue_sum_by_demo_prod['Product type'].unique()):
subset = revenue_sum_by_demo_prod[revenue_sum_by_demo_prod['Product type'] == product_type]
fig.add_trace(
go.Bar(
x=subset['Customer demographics'],
y=subset['Revenue generated'],
showlegend=False,
marker_color=colors[i]
),
row=1, col=2
)
fig.update_layout(
title='Revenue Analysis by Customer Demographics and Product Type',
xaxis=dict(title='Customer Demographics'),
yaxis=dict(title='Revenue'),
xaxis2=dict(title='Customer Demographics'),
yaxis2=dict(title='Revenue'),
title_x=0.5,
showlegend=True
)
fig.show()
sales_data = df.groupby('Product type')['Number of products sold'].sum().reset_index()
pie_chart = px.pie(sales_data, values='Number of products sold', names='Product type',
title='Sales by Product Type',
hover_data=['Number of products sold'],
hole=0.5,
color_discrete_sequence=px.colors.sequential.Turbo)
pie_chart.update_traces(textposition='inside', textinfo='percent+label')
pie_chart.update_layout(title={'text': 'Sales by Product Type', 'x': 0.5})
pie_chart.show()
from sklearn.preprocessing import LabelEncoder
df_transformed = df.copy()
label_encoders = {}
for column in df_transformed.select_dtypes(include=['object']).columns:
label_encoders[column] = LabelEncoder()
df_transformed[column] = label_encoders[column].fit_transform(df_transformed[column])
X = df_transformed.loc[:,['Product type', 'SKU', 'Price', 'Availability',
'Revenue generated', 'Customer demographics',
'Stock levels', 'Lead times', 'Order quantities', 'Shipping times',
'Shipping carriers', 'Shipping costs', 'Supplier name', 'Location',
'Lead time', 'Production volumes', 'Manufacturing lead time',
'Manufacturing costs', 'Inspection results', 'Defect rates',
'Transportation modes', 'Routes', 'Costs']]
y = df_transformed.loc[:,'Number of products sold']
num_folds = 10
mse_scores = []
rmse_scores = []
mae_scores = []
r2_scores = []
actual_values = np.zeros(len(y))
predicted_values = np.zeros(len(y))
from sklearn.model_selection import KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
params = {
'objective': 'regression',
'metric': 'mean_squared_error',
'boosting_type': 'gbdt',
'num_leaves': 62,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'early_stopping_round': 5,
'verbose' : -1
}
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
for train_index, test_index in kf.split(X):
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
num_round = 100
early_stopping_rounds = 5
bst = lgb.train(
params,
train_data,
num_round,
valid_sets=[test_data]
)
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
actual_values[test_index] = y_test
predicted_values[test_index] = y_pred
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
target_range = np.max(y_test) - np.min(y_test)
percentage_mse = (mse / target_range) * 100
percentage_rmse = (rmse / target_range) * 100
percentage_mae = (mae / target_range) * 100
percentage_r2 = (r2 * 100)
mse_scores.append(percentage_mse)
rmse_scores.append(percentage_rmse)
mae_scores.append(percentage_mae)
r2_scores.append(percentage_r2)
plot_df = pd.DataFrame({
'SKU': df['SKU'],
'Actual': actual_values,
'Predicted': predicted_values
})
plot_df = plot_df.sort_values(by='SKU')
fig = go.Figure()
fig.add_trace(go.Scatter(x=plot_df['SKU'], y=plot_df['Actual'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=plot_df['SKU'], y=plot_df['Predicted'], mode='lines', name='Predicted'))
fig.update_layout(
title='Actual vs Predicted Number of Products Sold by SKU',
xaxis_title='SKU',
yaxis_title='Number of Products Sold',
font=dict(family="Arial", size=14),
title_x=0.5
)
fig.show()
avg_mse = np.mean(mse_scores)
avg_rmse = np.mean(rmse_scores)
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)
print(f"Average Mean Squared Error: {avg_mse:.2f}%")
print(f"Average Root Mean Squared Error: {avg_rmse:.2f}%")
print(f"Average Mean Absolute Error: {avg_mae:.2f}%")
print(f"Average R-squared: {avg_r2:.2f}%")
Average Mean Squared Error: 10389.38% Average Root Mean Squared Error: 34.65% Average Mean Absolute Error: 31.02% Average R-squared: -4.38%
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
data = pd.read_csv('supply_chain_data.csv')
data.columns = data.columns.str.strip()
numerical_cols = ['Price', 'Availability', 'Revenue generated', 'Stock levels',
'Lead times', 'Order quantities', 'Shipping times', 'Shipping costs',
'Lead time', 'Production volumes', 'Manufacturing lead time',
'Manufacturing costs', 'Defect rates', 'Costs']
categorical_cols = ['Product type', 'SKU', 'Customer demographics', 'Shipping carriers',
'Supplier name', 'Location', 'Inspection results',
'Transportation modes', 'Routes']
numerical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
model = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)])
X = data.drop('Number of products sold', axis=1)
y = data['Number of products sold']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
'''# Predictions and evaluation
target_range = np.max(y_test) - np.min(y_test)
percentage_mse = (mse / target_range) * 100
percentage_rmse = (rmse / target_range) * 100
percentage_mae = (mae / target_range) * 100
percentage_r2 = (r2 * 100)'''
models = {
'Linear Regression': LinearRegression(),
'Gradient Boosting': GradientBoostingRegressor(random_state=42),
'XGBoost': xgb.XGBRegressor(random_state=42),
'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}
def evaluate_model(model, X_train, X_test, y_train, y_test):
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
target_range = np.max(y_test) - np.min(y_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
percentage_mse = (mse / target_range) * 100
percentage_rmse = (rmse / target_range) * 100
percentage_mae = (mae / target_range) * 100
percentage_r2 = (r2 * 100)
return mse, rmse, mae, r2
results = {}
for model_name, model in models.items():
results[model_name] = evaluate_model(model, X_train, X_test, y_train, y_test)
results_df = pd.DataFrame(results, index=['MAE', 'MSE', 'RMSE', 'R2']).T
results_df
| MAE | MSE | RMSE | R2 | |
|---|---|---|---|---|
| Linear Regression | 198499.007684 | 445.532275 | 399.193096 | -1.080015 |
| Gradient Boosting | 165178.180824 | 406.421186 | 353.680217 | -0.730855 |
| XGBoost | 191217.040688 | 437.283707 | 378.533047 | -1.003709 |
| Random Forest | 158605.649115 | 398.253247 | 357.678500 | -0.661984 |
Feature Engineering:
Hyperparameter Tuning:
num_leaves, learning_rate, max_depth, etc.